{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "52ab8a56",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Execute this cell to install dependencies\n",
    "%pip install sf-hamilton[visualization]"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "66d615ca",
   "metadata": {},
   "source": [
    "# PySpark example [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/dagworks-inc/hamilton/blob/main/examples/spark/pyspark/notebook.ipynb) [![GitHub badge](https://img.shields.io/badge/github-view_source-2b3137?logo=github)](https://github.com/apache/hamilton/blob/main/examples/spark/pyspark/notebook.ipynb)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "id": "36ee33d2",
   "metadata": {},
   "outputs": [],
   "source": [
    "import pyspark.sql as ps\n",
    "import pandas as pd\n",
    "from pyspark.sql.functions import col, mean, stddev"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "id": "27af76a5",
   "metadata": {},
   "outputs": [],
   "source": [
    "spark_session = ps.SparkSession.builder.master(\"local[1]\").getOrCreate()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "id": "e317b40d",
   "metadata": {},
   "outputs": [],
   "source": [
    "pd_df = pd.DataFrame(\n",
    "        {\n",
    "            \"spend\": [\n",
    "                10,\n",
    "                10,\n",
    "                20,\n",
    "                40,\n",
    "                40,\n",
    "                50,\n",
    "                60,\n",
    "                70,\n",
    "                90,\n",
    "                100,\n",
    "                70,\n",
    "                80,\n",
    "                90,\n",
    "                100,\n",
    "                110,\n",
    "                120,\n",
    "                130,\n",
    "                140,\n",
    "                150,\n",
    "                160,\n",
    "            ],\n",
    "            \"signups\": [\n",
    "                1,\n",
    "                10,\n",
    "                50,\n",
    "                100,\n",
    "                200,\n",
    "                400,\n",
    "                600,\n",
    "                800,\n",
    "                1000,\n",
    "                1200,\n",
    "                1400,\n",
    "                1600,\n",
    "                1800,\n",
    "                2000,\n",
    "                2200,\n",
    "                2400,\n",
    "                2600,\n",
    "                2800,\n",
    "                3000,\n",
    "                3200,\n",
    "            ],\n",
    "        }\n",
    "    )\n",
    "ps_df = spark_session.createDataFrame(pd_df)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "id": "56241b2a",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "DataFrame[foo: double]"
      ]
     },
     "execution_count": 17,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "ps_df.select(mean(col(\"spend\")).alias(\"foo\"))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 25,
   "id": "6f452be4",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "DataFrame[spend: bigint, signups: bigint, foo: bigint]"
      ]
     },
     "execution_count": 25,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "ps_df.withColumn(\"foo\", ps_df['signups']*ps_df['spend'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "cc34219c",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.10"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
